Import des librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

%matplotlib inline
pd.set_option('display.max_columns', 100)

import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly

from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator

import cv2
import os
from os import path

# Tokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

# Stop words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 

# Lemmatizer (base d'un mot)
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from PIL import Image

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics 

import gensim
from gensim.models import Word2Vec
from gensim.utils import tokenize

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

import transformers
from transformers import TFAutoModel, AutoTokenizer

import tensorflow_hub as hub
import tensorflow_text
embed = hub.load("universal-sentence-encoder-large_5/")
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
import tensorflow as tens

import time

import warnings
warnings.filterwarnings('ignore')
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oorvasisooprayen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Import dataset

In [2]:
data = pd.read_csv("Flipkart/flipkart_com-ecommerce_sample_1050.csv", encoding="utf-8-sig")

Description jeu de données

In [3]:
print("Le jeu de données flipkart_com-ecommerce_sample contient %d lignes et %d colonnes." % (data.shape[0], data.shape[1]))
Le jeu de données flipkart_com-ecommerce_sample contient 1050 lignes et 15 colonnes.
In [4]:
data.head()
Out[4]:
uniq_id crawl_timestamp product_url product_name product_category_tree pid retail_price discounted_price image is_FK_Advantage_product description product_rating overall_rating brand product_specifications
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... ["Home Furnishing >> Curtains & Accessories >>... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False Key Features of Elegance Polyester Multicolor ... No rating available No rating available Elegance {"product_specification"=>[{"key"=>"Brand", "v...
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel ["Baby Care >> Baby Bath & Skin >> Baby Bath T... BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False Specifications of Sathiyas Cotton Bath Towel (... No rating available No rating available Sathiyas {"product_specification"=>[{"key"=>"Machine Wa...
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set ["Baby Care >> Baby Bath & Skin >> Baby Bath T... BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False Key Features of Eurospa Cotton Terry Face Towe... No rating available No rating available Eurospa {"product_specification"=>[{"key"=>"Material",...
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... ["Home Furnishing >> Bed Linen >> Bedsheets >>... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False Key Features of SANTOSH ROYAL FASHION Cotton P... No rating available No rating available SANTOSH ROYAL FASHION {"product_specification"=>[{"key"=>"Brand", "v...
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... ["Home Furnishing >> Bed Linen >> Bedsheets >>... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False Key Features of Jaipur Print Cotton Floral Kin... No rating available No rating available Jaipur Print {"product_specification"=>[{"key"=>"Machine Wa...
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   uniq_id                  1050 non-null   object 
 1   crawl_timestamp          1050 non-null   object 
 2   product_url              1050 non-null   object 
 3   product_name             1050 non-null   object 
 4   product_category_tree    1050 non-null   object 
 5   pid                      1050 non-null   object 
 6   retail_price             1049 non-null   float64
 7   discounted_price         1049 non-null   float64
 8   image                    1050 non-null   object 
 9   is_FK_Advantage_product  1050 non-null   bool   
 10  description              1050 non-null   object 
 11  product_rating           1050 non-null   object 
 12  overall_rating           1050 non-null   object 
 13  brand                    712 non-null    object 
 14  product_specifications   1049 non-null   object 
dtypes: bool(1), float64(2), object(12)
memory usage: 116.0+ KB
In [6]:
msno.matrix(data)
Out[6]:
<AxesSubplot:>
In [7]:
data.isna().sum()
Out[7]:
uniq_id                      0
crawl_timestamp              0
product_url                  0
product_name                 0
product_category_tree        0
pid                          0
retail_price                 1
discounted_price             1
image                        0
is_FK_Advantage_product      0
description                  0
product_rating               0
overall_rating               0
brand                      338
product_specifications       1
dtype: int64
In [8]:
data = data.dropna(subset=['retail_price', 'discounted_price', 'product_specifications'], how='all')

product_category_tree

In [9]:
#data["product_category_tree"].tolist()
In [10]:
data['product_category_tree'].nunique()
Out[10]:
642
In [11]:
data['product_category_tree'].str.split(">>", expand=True)
Out[11]:
0 1 2 3 4 5 6
0 ["Home Furnishing Curtains & Accessories Curtains Elegance Polyester Multicolor Abstract Eyelet... None None None
1 ["Baby Care Baby Bath & Skin Baby Bath Towels Sathiyas Baby Bath Towels Sathiyas Cotton Bath Towel (3 Bath Towel, Red... None None
2 ["Baby Care Baby Bath & Skin Baby Bath Towels Eurospa Baby Bath Towels Eurospa Cotton Terry Face Towel Set (20 PIECE... None None
3 ["Home Furnishing Bed Linen Bedsheets SANTOSH ROYAL FASHION Bedsheets SANTOSH ROYAL FASHION Cotton Printed King siz... None None
4 ["Home Furnishing Bed Linen Bedsheets Jaipur Print Bedsheets Jaipur Print Cotton Floral King sized Double ... None None
... ... ... ... ... ... ... ...
1045 ["Baby Care Baby & Kids Gifts Stickers Oren Empower Stickers"] None None None
1046 ["Baby Care Baby & Kids Gifts Stickers Wallmantra Stickers"] None None None
1047 ["Baby Care Baby & Kids Gifts Stickers Uberlyfe Stickers"] None None None
1048 ["Baby Care Baby & Kids Gifts Stickers Wallmantra Stickers"] None None None
1049 ["Baby Care Baby & Kids Gifts Stickers Uberlyfe Stickers"] None None None

1050 rows × 7 columns

In [12]:
data['product_category_tree'].str.split(">>", n=2, expand=True)
Out[12]:
0 1 2
0 ["Home Furnishing Curtains & Accessories Curtains >> Elegance Polyester Multicolor Abs...
1 ["Baby Care Baby Bath & Skin Baby Bath Towels >> Sathiyas Baby Bath Towels...
2 ["Baby Care Baby Bath & Skin Baby Bath Towels >> Eurospa Baby Bath Towels ...
3 ["Home Furnishing Bed Linen Bedsheets >> SANTOSH ROYAL FASHION Bedsheets ...
4 ["Home Furnishing Bed Linen Bedsheets >> Jaipur Print Bedsheets >> Jaipur...
... ... ... ...
1045 ["Baby Care Baby & Kids Gifts Stickers >> Oren Empower Stickers"]
1046 ["Baby Care Baby & Kids Gifts Stickers >> Wallmantra Stickers"]
1047 ["Baby Care Baby & Kids Gifts Stickers >> Uberlyfe Stickers"]
1048 ["Baby Care Baby & Kids Gifts Stickers >> Wallmantra Stickers"]
1049 ["Baby Care Baby & Kids Gifts Stickers >> Uberlyfe Stickers"]

1050 rows × 3 columns

In [13]:
data["product_category_1"] = data["product_category_tree"].apply(
    lambda x: x.split('["')[1].split('"]')[0].split(">>")[0]
)
In [14]:
data["product_category_2"] = data["product_category_tree"].apply(
    lambda x: x.split('["')[1].split('"]')[0].split(">>")[1]
)
In [15]:
data["product_category_3"] = data["product_category_tree"].apply(
    lambda x: x.split('["')[1].split('"]')[0].split(">>")[2]
    if len(x.split(">>")) > 2
    else ""
)
In [16]:
data.head()
Out[16]:
uniq_id crawl_timestamp product_url product_name product_category_tree pid retail_price discounted_price image is_FK_Advantage_product description product_rating overall_rating brand product_specifications product_category_1 product_category_2 product_category_3
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... ["Home Furnishing >> Curtains & Accessories >>... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False Key Features of Elegance Polyester Multicolor ... No rating available No rating available Elegance {"product_specification"=>[{"key"=>"Brand", "v... Home Furnishing Curtains & Accessories Curtains
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel ["Baby Care >> Baby Bath & Skin >> Baby Bath T... BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False Specifications of Sathiyas Cotton Bath Towel (... No rating available No rating available Sathiyas {"product_specification"=>[{"key"=>"Machine Wa... Baby Care Baby Bath & Skin Baby Bath Towels
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set ["Baby Care >> Baby Bath & Skin >> Baby Bath T... BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False Key Features of Eurospa Cotton Terry Face Towe... No rating available No rating available Eurospa {"product_specification"=>[{"key"=>"Material",... Baby Care Baby Bath & Skin Baby Bath Towels
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... ["Home Furnishing >> Bed Linen >> Bedsheets >>... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False Key Features of SANTOSH ROYAL FASHION Cotton P... No rating available No rating available SANTOSH ROYAL FASHION {"product_specification"=>[{"key"=>"Brand", "v... Home Furnishing Bed Linen Bedsheets
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... ["Home Furnishing >> Bed Linen >> Bedsheets >>... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False Key Features of Jaipur Print Cotton Floral Kin... No rating available No rating available Jaipur Print {"product_specification"=>[{"key"=>"Machine Wa... Home Furnishing Bed Linen Bedsheets
In [17]:
data["product_category_1"]
Out[17]:
0       Home Furnishing 
1             Baby Care 
2             Baby Care 
3       Home Furnishing 
4       Home Furnishing 
              ...       
1045          Baby Care 
1046          Baby Care 
1047          Baby Care 
1048          Baby Care 
1049          Baby Care 
Name: product_category_1, Length: 1050, dtype: object
In [18]:
baby = data[data["product_category_1"] == "Baby Care "]
In [19]:
fig = go.Figure(go.Parcats(
    dimensions=[
        {'values': baby.product_category_1.values},
        {'values': baby.product_category_2.values},
        {'values': baby.product_category_3.values}]
))

fig.update_layout(
    title="Product Category 'Baby Care' décomposée"
)

fig.show()
In [20]:
home = data[data["product_category_1"] == "Home Furnishing "]
fig = go.Figure(go.Parcats(
    dimensions=[
        {'values': home.product_category_1.values},
        {'values': home.product_category_2.values},
        {'values': home.product_category_3.values}]
))

fig.update_layout(
    title="Product Category 'Home Furnishing' décomposée"
)

fig.show()
In [21]:
data = data.drop(['product_category_tree'], axis=1)
In [22]:
for col in ["product_category_1", "product_category_2", "product_category_3"]:
    print(f"Nombre de catégories pour la colonne {col} = {data[col].nunique()}")
    print(data[col].value_counts().sort_values(ascending=False))
    print("-" * 80)
Nombre de catégories pour la colonne product_category_1 = 7
Computers                      150
Home Furnishing                150
Watches                        150
Beauty and Personal Care       150
Kitchen & Dining               150
Baby Care                      150
Home Decor & Festive Needs     150
Name: product_category_1, dtype: int64
--------------------------------------------------------------------------------
Nombre de catégories pour la colonne product_category_2 = 63
 Wrist Watches               149
 Laptop Accessories           87
 Infant Wear                  84
 Coffee Mugs                  74
 Showpieces                   71
                            ... 
 Women's Hygiene               1
 JMD Home Furnishing           1
 Kripa's Home Furnishing       1
 Clocks                        1
 Housekeeping & Laundry        1
Name: product_category_2, Length: 63, dtype: int64
--------------------------------------------------------------------------------
Nombre de catégories pour la colonne product_category_3 = 247
 Deodorants                   65
 Blankets, Quilts & Dohars    56
 Baby Girls' Clothes          49
 Routers                      49
 USB Gadgets                  38
                              ..
 eCraftIndia Showpieces        1
 North Moon Wrist Watches      1
 T STAR Wrist Watches          1
 Cutlery                       1
 Timex Wrist Watches           1
Name: product_category_3, Length: 247, dtype: int64
--------------------------------------------------------------------------------
In [23]:
for elem in data['product_category_1'].unique():
    print('---',elem,'---')
    wordcloud = WordCloud(max_words=50).generate(' '.join(list(data[data['product_category_1']==elem]['description'])))
    
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
                                                                   
--- Home Furnishing  ---
--- Baby Care  ---
--- Watches  ---
--- Home Decor & Festive Needs  ---
--- Kitchen & Dining  ---
--- Beauty and Personal Care  ---
--- Computers  ---

product_specifications

In [24]:
#data["product_specifications"].tolist()
In [25]:
# Fonction pour converture le contenu du champs product_specifications
def json_reformat(a):
    a="{"+a[26:-1].replace('\"key\"=>',"").replace('\"value\"=>',"").replace("\", \"","\":\"").replace("[","").replace("]","").replace("{","").replace("}","")+"}"
    return a

data["product_specifications"]=data["product_specifications"].apply(lambda x: json_reformat(x) if x is not np.nan else "")
In [26]:
data.head()
Out[26]:
uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating overall_rating brand product_specifications product_category_1 product_category_2 product_category_3
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False Key Features of Elegance Polyester Multicolor ... No rating available No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False Specifications of Sathiyas Cotton Bath Towel (... No rating available No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False Key Features of Eurospa Cotton Terry Face Towe... No rating available No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False Key Features of SANTOSH ROYAL FASHION Cotton P... No rating available No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False Key Features of Jaipur Print Cotton Floral Kin... No rating available No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets

product_rating

In [27]:
data["product_rating"].unique()
Out[27]:
array(['No rating available', '5', '1', '2.3', '2.7', '4.5', '4', '3.8',
       '4.3', '3.7', '3.6', '3.5', '4.1', '4.9', '3.1', '4.2', '4.8',
       '4.4', '3.9', '3', '3.3', '2.5', '2', '4.7', '2.2', '3.2', '1.5'],
      dtype=object)
In [28]:
fig = px.bar(data["product_rating"].value_counts(),
             title="Nombre d'avis par produits")
fig.show()
In [29]:
fig = px.bar(data["overall_rating"].value_counts(),
             title="Nombre d'avis")
fig.show()
In [30]:
data = data.drop(['overall_rating'], axis=1)

brand

In [31]:
fig = px.bar(data["brand"].value_counts(),
             title="Marque des produits")
fig.show()

Nettoyage du texte

Etape 1: Remove punctuations

In [32]:
## Lowercase 
data['description'] = data['description'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## remove punctuation
data['description'] = data['description'].str.replace('[^\w\s]','')

Etape 2: Tokenization

Cette tâche consiste à prendre une longue chaîne de texte et convertit chaque mot en un «jeton» ou une valeur et les place dans une liste. Les valeurs de la liste sont beaucoup plus faciles à manipuler par les étapes ultérieures.

In [33]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]    
In [34]:
data['tokenized'] = data.apply(lambda x: tokenize(x['description']), axis=1)
data.head()
Out[34]:
uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False key features of elegance polyester multicolor ... No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains [key, features, of, elegance, polyester, multi...
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False specifications of sathiyas cotton bath towel 3... No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels [specifications, of, sathiyas, cotton, bath, t...
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False key features of eurospa cotton terry face towe... No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels [key, features, of, eurospa, cotton, terry, fa...
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False key features of santosh royal fashion cotton p... No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets [key, features, of, santosh, royal, fashion, c...
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False key features of jaipur print cotton floral kin... No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets [key, features, of, jaipur, print, cotton, flo...

Etape 2: Stopword removal

Cette étape consiste à réduire le bruit dans nos données en supprimant les "Stopwords". Ce sont des mots spéciaux spécifiques à une langue qui apparaissent dans une phrase et qui ajoutent peu de valeur au sens. Les supprimer aide le modèle à voir les mots qui comptent.

In [35]:
nltk.download('stopwords')
nltk.download('wordnet')
#nltk.download('omw-1.4')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oorvasisooprayen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oorvasisooprayen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[35]:
True
In [36]:
def remove_stopwords(tokenized_column):
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]
In [37]:
data['stopwords_removed'] = data.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
data.head()
Out[37]:
uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized stopwords_removed
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False key features of elegance polyester multicolor ... No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains [key, features, of, elegance, polyester, multi... [key, features, elegance, polyester, multicolo...
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False specifications of sathiyas cotton bath towel 3... No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels [specifications, of, sathiyas, cotton, bath, t... [specifications, sathiyas, cotton, bath, towel...
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False key features of eurospa cotton terry face towe... No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels [key, features, of, eurospa, cotton, terry, fa... [key, features, eurospa, cotton, terry, face, ...
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False key features of santosh royal fashion cotton p... No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets [key, features, of, santosh, royal, fashion, c... [key, features, santosh, royal, fashion, cotto...
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False key features of jaipur print cotton floral kin... No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets [key, features, of, jaipur, print, cotton, flo... [key, features, jaipur, print, cotton, floral,...

Etape 3: Stemming et Lemmatization

Bien que les deux techniques soient similaires, elles produisent des résultats différents. Il est donc important de déterminer celle qui convient à l'analyse qu'on souhaite effectuer.

Le stemming, le plus simple des deux, regroupe les mots par leur radical racine. Cela nous permet de reconnaître que 'jumping' 'jumps' et 'jumped' sont tous enracinés dans le même verbe (jump) et font donc référence à des problèmes similaires.

La lemmatisation, d'autre part, regroupe les mots en fonction de la définition de la racine et nous permet de différencier le présent, le passé et l'indéfini.

In [38]:
def apply_stemming(tokenized_column):
    stemmer = PorterStemmer() 
    return [stemmer.stem(word) for word in tokenized_column]
In [39]:
data['porter_stemmed'] = data.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
data.head()
Out[39]:
uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized stopwords_removed porter_stemmed
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False key features of elegance polyester multicolor ... No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains [key, features, of, elegance, polyester, multi... [key, features, elegance, polyester, multicolo... [key, featur, eleg, polyest, multicolor, abstr...
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False specifications of sathiyas cotton bath towel 3... No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels [specifications, of, sathiyas, cotton, bath, t... [specifications, sathiyas, cotton, bath, towel... [specif, sathiya, cotton, bath, towel, bath, t...
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False key features of eurospa cotton terry face towe... No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels [key, features, of, eurospa, cotton, terry, fa... [key, features, eurospa, cotton, terry, face, ... [key, featur, eurospa, cotton, terri, face, to...
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False key features of santosh royal fashion cotton p... No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets [key, features, of, santosh, royal, fashion, c... [key, features, santosh, royal, fashion, cotto... [key, featur, santosh, royal, fashion, cotton,...
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False key features of jaipur print cotton floral kin... No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets [key, features, of, jaipur, print, cotton, flo... [key, features, jaipur, print, cotton, floral,... [key, featur, jaipur, print, cotton, floral, k...
In [40]:
def apply_lemmatize(tokenized_column):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokenized_column]
In [41]:
data['lemmatize'] = data.apply(lambda x: apply_lemmatize(x['stopwords_removed']), axis=1)
data.head()
Out[41]:
uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized stopwords_removed porter_stemmed lemmatize
0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False key features of elegance polyester multicolor ... No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains [key, features, of, elegance, polyester, multi... [key, features, elegance, polyester, multicolo... [key, featur, eleg, polyest, multicolor, abstr... [key, feature, elegance, polyester, multicolor...
1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False specifications of sathiyas cotton bath towel 3... No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels [specifications, of, sathiyas, cotton, bath, t... [specifications, sathiyas, cotton, bath, towel... [specif, sathiya, cotton, bath, towel, bath, t... [specification, sathiyas, cotton, bath, towel,...
2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False key features of eurospa cotton terry face towe... No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels [key, features, of, eurospa, cotton, terry, fa... [key, features, eurospa, cotton, terry, face, ... [key, featur, eurospa, cotton, terri, face, to... [key, feature, eurospa, cotton, terry, face, t...
3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False key features of santosh royal fashion cotton p... No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets [key, features, of, santosh, royal, fashion, c... [key, features, santosh, royal, fashion, cotto... [key, featur, santosh, royal, fashion, cotton,... [key, feature, santosh, royal, fashion, cotton...
4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False key features of jaipur print cotton floral kin... No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets [key, features, of, jaipur, print, cotton, flo... [key, features, jaipur, print, cotton, floral,... [key, featur, jaipur, print, cotton, floral, k... [key, feature, jaipur, print, cotton, floral, ...
In [42]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', -1)
data[['tokenized','stopwords_removed','porter_stemmed','lemmatize']][:1].transpose()
Out[42]:
0
tokenized [key, features, of, elegance, polyester, multi...
stopwords_removed [key, features, elegance, polyester, multicolo...
porter_stemmed [key, featur, eleg, polyest, multicolor, abstr...
lemmatize [key, feature, elegance, polyester, multicolor...

Maintenant, assemblons les tokens traités

In [43]:
lemm = data['lemmatize']
In [44]:
for i in range(len(lemm)):
    lemm[i] = ' '.join(lemm[i])
In [45]:
stemm = data['porter_stemmed']
In [46]:
for i in range(len(stemm)):
    stemm[i] = ' '.join(stemm[i])

Bag of Words

Bag of Words est une représentation qui transforme un texte arbitraire en vecteurs de longueur fixe en comptant le nombre de fois où chaque mot apparaît.

CountVectorizer

CountVectorizer est utilisé pour convertir une collection de documents texte en un vecteur de nombre de termes

In [47]:
# création du bag of words (CountVectorizer)

cvect = CountVectorizer(stop_words='english', max_df=0.95, min_df=1)

lemmatize

In [48]:
cv_transform_lem = cvect.fit_transform(data['lemmatize'])  
In [49]:
features_names=cvect.get_feature_names()
word_fre_vect_lem=pd.DataFrame(cv_transform_lem.toarray(),columns=cvect.get_feature_names())
word_fre_vect_lem.head(5)
Out[49]:
aa aaa aapno aari aarika ability able abode abrasion abroad absolute absorbency absorbent absorber absorbing absorbs abstract abstrcts ac accent access accessory accident accidental accommodate accomplishes according acer ache aching acid acne acrylic act active actual adaa adapter adaptor add added addiction adding addition additional additionally adhesive adi adidas adino ... woven wow wowan wrap wrapped wrapper wring wringdo wrinkle wrinkleantishrinkage wrist write wrought xemex xl xp yardley yarn year yellow yellowblue yellowlittle yes ygs yiboo yield yk york youd young youre youth youthful youve yr yuva yves zaicus zero zikrak zinc zingalalaa zip zipexterior zipper zippered zone zoom zora zyxel
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 4709 columns

In [50]:
word_fre_vect_lem.shape
Out[50]:
(1050, 4709)
In [51]:
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_vect_lem.sum().sort_values(
    ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
    item.set_rotation(90)

Stemming

In [52]:
cv_transform_stem = cvect.fit_transform(data['porter_stemmed'])  
In [53]:
features_names=cvect.get_feature_names()
word_fre_vect_stem=pd.DataFrame(cv_transform_stem.toarray(),columns=cvect.get_feature_names())
word_fre_vect_stem.head(5)
Out[53]:
aa aaa aapno aari aarika abil abl abod abras abroad absolut absorb abstract abstrct ac accent access accessori accid accident accommod accomplish accord acer ach acid acn acryl act activ actual ad adaa adapt adaptor add addict addit adhes adi adida adino adjust admir ador adorn adsl advanc advic advis ... worn worri worshipp wouldnt woven wow wowan wrap wrapper wring wringdo wrinkl wrinkleantishrinkag wrist write wrought xemex xl xp yardley yarn ye year yellow yellowblu yellowlittl yg yiboo yield yk york youd young youth youv yr yuva yve zaicu zero zikrak zinc zingalalaa zip zipexterior zipper zone zoom zora zyxel
0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 4172 columns

In [54]:
word_fre_vect_stem.shape
Out[54]:
(1050, 4172)
In [55]:
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_vect_stem.sum().sort_values(
    ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
    item.set_rotation(90)

TF-IDF

Le TF-IDF (pour Term Frequency et Inverse Document Frequency) est une mesure utilisée pour déterminer la pertinence d’un terme dans un document. Cette méthode prend en compte non seulement l'occurrence d'un mot dans une seule description mais aussi dans le corpus entier. TF-IDF fonctionne en pénalisant les mots communs en leur attribuant des poids inférieurs tout en donnant de l'importance aux mots qui sont rares dans l'ensemble du corpus mais qui apparaissent en bon nombre dans peu de commentaires.

Termes importants liés à TF-IDF:

  • TF = (Nombre de fois que le terme t apparaît dans un document) / (Nombre de termes dans le document)
  • IDF = log (N / n), où, N est le nombre de documents et n est le nombre de documents dans lesquels un terme t est apparu.
  • TF-IDF = TF * IDF

La formule prend en compte la fréquence d’un terme (TF) dans un document donné ainsi que le nombre de documents contenant ce mot (IDF)

In [56]:
# création du bag of words (Tf-idf)

ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=1)

lemmatize

In [57]:
ctf_transform_lem = ctf.fit_transform(data['lemmatize'])
In [58]:
features_names=ctf.get_feature_names()
word_fre_tfidf_lem=pd.DataFrame(ctf_transform_lem.toarray(),columns=ctf.get_feature_names())
word_fre_tfidf_lem.head(5)
Out[58]:
aa aaa aapno aari aarika ability able abode abrasion abroad absolute absorbency absorbent absorber absorbing absorbs abstract abstrcts ac accent access accessory accident accidental accommodate accomplishes according acer ache aching acid acne acrylic act active actual adaa adapter adaptor add added addiction adding addition additional additionally adhesive adi adidas adino ... woven wow wowan wrap wrapped wrapper wring wringdo wrinkle wrinkleantishrinkage wrist write wrought xemex xl xp yardley yarn year yellow yellowblue yellowlittle yes ygs yiboo yield yk york youd young youre youth youthful youve yr yuva yves zaicus zero zikrak zinc zingalalaa zip zipexterior zipper zippered zone zoom zora zyxel
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.177632 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.077718 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.179174 0.0 0.0 0.069071 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.068951 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.042664 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.043112 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.035936 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 4709 columns

In [59]:
word_fre_tfidf_lem.shape
Out[59]:
(1050, 4709)
In [60]:
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_tfidf_lem.sum().sort_values(
    ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
    item.set_rotation(90)

stemmming

In [61]:
ctf_transform_stem = ctf.fit_transform(data['porter_stemmed'])  
In [62]:
features_names=ctf.get_feature_names()
word_fre_tfidf_stem=pd.DataFrame(ctf_transform_stem.toarray(),columns=ctf.get_feature_names())
word_fre_tfidf_stem.head(5)
Out[62]:
aa aaa aapno aari aarika abil abl abod abras abroad absolut absorb abstract abstrct ac accent access accessori accid accident accommod accomplish accord acer ach acid acn acryl act activ actual ad adaa adapt adaptor add addict addit adhes adi adida adino adjust admir ador adorn adsl advanc advic advis ... worn worri worshipp wouldnt woven wow wowan wrap wrapper wring wringdo wrinkl wrinkleantishrinkag wrist write wrought xemex xl xp yardley yarn ye year yellow yellowblu yellowlittl yg yiboo yield yk york youd young youth youv yr yuva yve zaicu zero zikrak zinc zingalalaa zip zipexterior zipper zone zoom zora zyxel
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.178258 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.077992 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.069134 0.000000 0.179337 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.056874 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.042683 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.044213 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.036775 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 4172 columns

In [63]:
word_fre_tfidf_stem.shape
Out[63]:
(1050, 4172)
In [64]:
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_tfidf_stem.sum().sort_values(
    ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
    item.set_rotation(90)

Latent Dirichlet Allocation (LDA)

Le LDA est un algorithme d'apprentissage qui cherche à maximiser la variance inter-classes par rapport à celle intra classe. Le LDA permets de faire une modélisation de sujets (topics) pour classer le texte d'un document dans un sujet particulier.

In [65]:
# Creation of LDA model

lda_model = LatentDirichletAllocation(
    n_components=10, 
    max_iter=10, 
    learning_method='online', 
    learning_offset=10.,
    random_state=42)
In [66]:
def lda(vector):
    LDA = lda_model.fit_transform(vector)
    return LDA
In [67]:
def display_topics(lda_model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(lda_model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
In [68]:
lda_tf_lem = lda(word_fre_vect_lem)
lda_tf_idf_lem = lda(word_fre_tfidf_lem)
lda_tf_stem = lda(word_fre_vect_stem)
lda_tf_idf_stem = lda(word_fre_tfidf_stem)

Présentation des 10 premiers topics obtenu par LatentDirichletAllocation sur CountVectorizer

In [69]:
display_topics(lda_model, cvect.get_feature_names(), 3)
Topic 0:
hair brush condition
Topic 1:
kadhai metallino kalash
Topic 2:
tawa dandruff hors
Topic 3:
analog watch women
Topic 4:
salwar kurti salli
Topic 5:
seater open prime
Topic 6:
flipkartcom product free
Topic 7:
shadow buddha jug
Topic 8:
showpiec cm best
Topic 9:
recrafto beyouti dhol

PCA

In [70]:
# PCA Pipeline
pca = PCA(svd_solver='full')
data_pca = pca.fit_transform(word_fre_vect_lem)
In [71]:
# Explained variance
varexpl = pca.explained_variance_ratio_*100

# Plot of cumulated variance
plt.figure(figsize=(12,8))
plt.bar(np.arange(len(varexpl))+1, varexpl)

cumSumVar = varexpl.cumsum()
plt.plot(np.arange(len(varexpl))+1, cumSumVar,c="red",marker='o')

valid_idx = np.where(cumSumVar >= 95)[0]
min_plans = valid_idx[cumSumVar[valid_idx].argmin()]+1


plt.xlabel("rang de l'axe d'inertie")
plt.xticks(np.arange(len(varexpl))+1)
plt.ylabel("pourcentage d'inertie")
plt.title("{}% de la variance totale est expliquée"\
          " par les {} premiers axes".format(95,
                                            min_plans))
plt.show(block=False)
In [72]:
def pca(vector):
    pca = PCA(n_components=0.95)
    ft_pca = pca.fit_transform(vector)
    
    return ft_pca
In [73]:
pca_tf_lem = pca(word_fre_vect_lem)
pca_tf_idf_lem = pca(word_fre_tfidf_lem)
pca_tf_stem = pca(word_fre_vect_stem)
pca_tf_idf_stem = pca(word_fre_tfidf_stem)

t-distributed stochastic neighbor embedding (TSNE)

La t-SNE est une technique de réduction de dimension non linéaire non supervisée. Il intègre (Embedding) les points d'une dimension supérieure à une dimension inférieure en essayant de préserver le voisinage de ce point.

In [74]:
tsne = TSNE(n_components=2, verbose=1, perplexity=80,n_iter=5000, learning_rate=200, random_state=42)
In [75]:
def TSNE (dimension):
    res_tsne = tsne.fit_transform(dimension)
    res_tsne_df = pd.DataFrame(res_tsne, columns=['tsne1', 'tsne2'])
    return res_tsne_df
In [76]:
tsne_pca_tf_lem = TSNE(pca_tf_lem)
tsne_pca_tf_idf_lem = TSNE(pca_tf_idf_lem)

tsne_lda_tf_lem = TSNE(lda_tf_lem)
tsne_lda_tf_idf_lem = TSNE(lda_tf_idf_lem)

tsne_pca_tf_stem = TSNE(pca_tf_stem)
tsne_pca_tf_idf_stem = TSNE(pca_tf_idf_stem)

tsne_lda_tf_stem = TSNE(lda_tf_stem)
tsne_lda_tf_idf_stem = TSNE(lda_tf_idf_stem)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.010s...
[t-SNE] Computed neighbors for 1050 samples in 0.743s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 1.922795
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.509449
[t-SNE] KL divergence after 2100 iterations: 0.609421
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.024s...
[t-SNE] Computed neighbors for 1050 samples in 1.345s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.426164
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.658752
[t-SNE] KL divergence after 3350 iterations: 0.767136
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.001s...
[t-SNE] Computed neighbors for 1050 samples in 0.072s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.008344
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.170494
[t-SNE] KL divergence after 1800 iterations: 0.244980
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.001s...
[t-SNE] Computed neighbors for 1050 samples in 0.062s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.007359
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.984055
[t-SNE] KL divergence after 1450 iterations: 0.152351
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.013s...
[t-SNE] Computed neighbors for 1050 samples in 0.809s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 1.958204
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.910698
[t-SNE] KL divergence after 3200 iterations: 0.612920
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.024s...
[t-SNE] Computed neighbors for 1050 samples in 1.410s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.429271
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.398315
[t-SNE] KL divergence after 3900 iterations: 0.769711
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.001s...
[t-SNE] Computed neighbors for 1050 samples in 0.077s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.011255
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.948486
[t-SNE] KL divergence after 2550 iterations: 0.248107
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.001s...
[t-SNE] Computed neighbors for 1050 samples in 0.051s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.004547
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.397511
[t-SNE] KL divergence after 3850 iterations: 0.147436

K-Means

In [77]:
list_ari = []
def plot_kmeans_tsne(reduction, title, filename, colname):

    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200,init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_
    cl_tsne = pd.concat([reduction,pd.DataFrame({'tsne_clusters':labels_tsne})],axis=1)
    
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']
    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)
    
    fig = px.scatter(data, x=cl_tsne.iloc[:,0], y = cl_tsne.iloc[:,1], color=categories_true, title=f"Représentation selon les vraies classes {title}")
    
    fig1 = px.scatter(data, x = cl_tsne.iloc[:,0],y = cl_tsne.iloc[:,1], color=categories_predict, title = f"Représentation selon les clusters {title}")
    
    plotly.offline.plot(fig, filename=f'plots/{filename}.html')
    plotly.offline.plot(fig1, filename=f'plots/{filename}_cluster.html')

    return fig.show(), fig1.show()
In [78]:
#visualizing CountVectorizer bag of words with PCA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_pca_tf_lem,
                 "Cluster Kmeans based on lemmatized CountVectorizer with PCA (TSNE)",
                "Kmeans_CountVect_lem_PCA_tsne","cvec_lem_pca_tsne") 
Adjusted Rand Index: 0.353
Out[78]:
(None, None)
Le graphique représente les produits classifiés en réel et ce que le modèle kmeans prédit.  On voit que c'est assez mal classifié.
In [79]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster cvec_lem_pca_tsne'] == x].index
             for x in data['cluster cvec_lem_pca_tsne'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
Par rapport à la répartition des clusters, on peut voir que les montres sont très bien reconnues par le modèle mais sues plusieurs autres produits, le modèle n'arrive pas à reconnaitre la catégorie des produits
In [80]:
#visualizing TF-IDF bag of words with PCA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_pca_tf_idf_lem,
                 "Cluster Kmeans based on lemmatized TF-IDF with PCA (TSNE)",
                "Kmeans_TFIDF_lem_PCA", "TFIDF_lem_PCA") 
Adjusted Rand Index: 0.346
Out[80]:
(None, None)
Ici aussi on peut voir que les produits sont mal classifié
In [81]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster TFIDF_lem_PCA'] == x].index
             for x in data['cluster TFIDF_lem_PCA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [82]:
#visualizing CountVectorizer bag of words with LDA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_lda_tf_lem,
                 "Cluster Kmeans based on lemmatized CountVectorizer with LDA (TSNE)",
                "Kmeans_CountVec_lem_LDA","CVec_lem_LDA") 
Adjusted Rand Index: 0.186
Out[82]:
(None, None)
In [83]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster CVec_lem_LDA'] == x].index
             for x in data['cluster CVec_lem_LDA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [84]:
list_ari2 = []
def plot_kmeans_tsne(reduction, title, filename, colname):

    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200,init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_
    cl_tsne = pd.concat([reduction,pd.DataFrame({'tsne_clusters':labels_tsne})],axis=1)
    
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']
    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari2.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)
    
    sns.scatterplot(data = data, x=cl_tsne.iloc[:,0], y = cl_tsne.iloc[:,1], hue=categories_true)
    
    plt.title(f"Représentation selon les classes {title}")

    return plt.show()
In [85]:
#visualizing TF-IDF bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))

plot_kmeans_tsne(tsne_lda_tf_idf_lem,
                 "Cluster Kmeans based on lemmatized TF-IDF with LDA (TSNE)",
                "Kmeans_TFIDF_lem_LDA", "TFIDF_lem_LDA") 
Adjusted Rand Index: 0.107
In [86]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster TFIDF_lem_LDA'] == x].index
             for x in data['cluster TFIDF_lem_LDA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [87]:
#visualizing CountVectorizer bag of words with PCA reduction by using 2D TSNE
plt.figure(figsize=(15,8))

plot_kmeans_tsne(tsne_pca_tf_stem,
                 "Cluster Kmeans based on stemmed CountVectorizer with PCA (TSNE)",
                "Kmeans_CountVec_stem_PCA", "CountVec_stem_PCA") 
Adjusted Rand Index: 0.322
In [88]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster CountVec_stem_PCA'] == x].index
             for x in data['cluster CountVec_stem_PCA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [89]:
#visualizing TF-IDF bag of words with PCA reduction by using 2D TSNE
plt.figure(figsize=(15,8))

plot_kmeans_tsne(tsne_pca_tf_idf_stem,
                 "Cluster Kmeans based on stemmed TF-IDF with PCA (TSNE)",
                "Kmeans_TFIDF_stem_PCA", "TFIDF_stem_PCA") 
Adjusted Rand Index: 0.395
In [90]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster TFIDF_stem_PCA'] == x].index
             for x in data['cluster TFIDF_stem_PCA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [91]:
#visualizing CountVectorizer bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))

plot_kmeans_tsne(tsne_lda_tf_stem,
                 "Cluster Kmeans based on stemmed CountVectorizer with LDA (TSNE)",
                "Kmeans_countVec_stem_LDA", "countVec_stem_LDA") 
Adjusted Rand Index: 0.157
In [92]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster countVec_stem_LDA'] == x].index
             for x in data['cluster countVec_stem_LDA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [93]:
#visualizing CountVectorizer bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))

plot_kmeans_tsne(tsne_lda_tf_idf_lem,
                 "Cluster Kmeans based on stemmed TF-IDF with LDA (TSNE)",
                "Kmeans_TFIDF_stem_LDA", "TFIDF_stem_LDA") 
Adjusted Rand Index: 0.107
In [94]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster TFIDF_stem_LDA'] == x].index
             for x in data['cluster TFIDF_stem_LDA'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

Word embedding

Word embedding are also a very popular way to approach an NLP problem in which words are converted into vectors and used in various ML and deeplearning models.

Word2Vec

Word2Vec est un algorithme non supervisé qui utilise un réseau de neurones à 3 couches (1 couche d’entrée, 1 couche cachée, 1 couche de sortie). Word2Vec génère des plongements indépendants du contexte : c'est-à-dire qu'il n'y a qu'une seule représentation vectorielle pour chaque mot.

lemmatize

In [95]:
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 2524 # adapt to length of sentences
sentences = data['lemmatize'].to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]
In [96]:
# Création et entraînement du modèle Word2Vec

print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window, vector_size=w2v_size, seed=42, workers=1, epochs=w2v_epochs)
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

#print('words:',w2v_words)
Build & train Word2Vec model ...
Vocabulary size: 4637
Word2Vec trained
In [97]:
# Préparation des sentences (tokenization)

print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)
Fit Tokenizer ...
Number of unique words: 4638
In [98]:
plt.figure(num=None, figsize=(15, 10), facecolor='w', edgecolor='k')

# fit a 2d PCA model to the vectors
X = model_vectors[model_vectors.index_to_key]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# # create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
words = list(model_vectors.index_to_key)

for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))

plt.show()

Création de la matrice d'embedding

In [99]:
# Création de la matrice d'embedding

print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index.items():
    i +=1
    if word in w2v_words:
        j +=1
        embedding_vector = model_vectors[word]
        if embedding_vector is not None:
            embedding_matrix[idx] = model_vectors[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))
Create Embedding matrix ...
Word embedding rate :  1.0
Embedding matrix: (4638, 300)
In [100]:
# Création du modèle

input=Input(shape=(len(x_sentences),maxlen),dtype='float64')
word_input=Input(shape=(maxlen,),dtype='float64')  
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
word_vec=GlobalAveragePooling1D()(word_embedding)  
embed_model = Model([word_input],word_vec)

embed_model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_2 (InputLayer)        [(None, 2524)]            0         
                                                                 
 embedding (Embedding)       (None, 2524, 300)         1391400   
                                                                 
 global_average_pooling1d (G  (None, 300)              0         
 lobalAveragePooling1D)                                          
                                                                 
=================================================================
Total params: 1,391,400
Trainable params: 1,391,400
Non-trainable params: 0
_________________________________________________________________
In [101]:
embeddings = embed_model.predict(x_sentences)
embeddings.shape
Out[101]:
(1050, 300)
In [102]:
X_tsne_w2v = tsne.fit_transform(embeddings)

df_tsne_w2v = pd.DataFrame(X_tsne_w2v[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_w2v.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.012s...
[t-SNE] Computed neighbors for 1050 samples in 0.709s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.020349
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.291801
[t-SNE] KL divergence after 1900 iterations: 0.341102
(1050, 2)
In [103]:
plt.figure(figsize=(15,8))

plot_kmeans_tsne(df_tsne_w2v,
                 "Cluster Kmeans based on lemmatized word2vec",
                "Kmeans_word2vec_lemmatize", "word2vec_lem") 
Adjusted Rand Index: 0.381
In [104]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster word2vec_lem'] == x].index
             for x in data['cluster word2vec_lem'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

stemming

In [105]:
sentences_stem = data['porter_stemmed'].to_list()
sentences_stem = [gensim.utils.simple_preprocess(text) for text in sentences_stem]
In [106]:
# Création et entraînement du modèle Word2Vec

print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window, vector_size=w2v_size, seed=42, workers=1, epochs=w2v_epochs)
w2v_model.build_vocab(sentences_stem)
w2v_model.train(sentences_stem, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
model_vectors_stem = w2v_model.wv
w2v_words_stem = model_vectors_stem.index_to_key
print("Vocabulary size: %i" % len(w2v_words_stem))
print("Word2Vec trained")

#print('words:',w2v_words)
Build & train Word2Vec model ...
Vocabulary size: 4217
Word2Vec trained
In [107]:
# Préparation des sentences (tokenization)

print("Fit Tokenizer ...")
tokenizer_stem = Tokenizer()
tokenizer_stem.fit_on_texts(sentences_stem)
x_sentences_stem = pad_sequences(tokenizer_stem.texts_to_sequences(sentences_stem),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words_stem = len(tokenizer_stem.word_index) + 1
print("Number of unique words: %i" % num_words_stem)
Fit Tokenizer ...
Number of unique words: 4218
In [108]:
plt.figure(num=None, figsize=(15, 10), facecolor='w', edgecolor='k')

# fit a 2d PCA model to the vectors
X = model_vectors_stem[model_vectors_stem.index_to_key]
pca_stem = PCA(n_components=2)
result_stem = pca_stem.fit_transform(X)

# # create a scatter plot of the projection
plt.scatter(result_stem[:, 0], result_stem[:, 1])
words_stem = list(model_vectors_stem.index_to_key)

for i, word in enumerate(words_stem):
    plt.annotate(word, xy=(result_stem[i, 0], result_stem[i, 1]))

plt.show()
In [109]:
# Création de la matrice d'embedding

print("Create Embedding matrix ...")
w2v_size = 300
word_index_stem = tokenizer_stem.word_index
vocab_size = len(word_index) + 1
embedding_matrix_stem = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index_stem.items():
    i +=1
    if word in w2v_words_stem:
        j +=1
        embedding_vector = model_vectors_stem[word]
        if embedding_vector is not None:
            embedding_matrix_stem[idx] = model_vectors_stem[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix_stem.shape))
Create Embedding matrix ...
Word embedding rate :  1.0
Embedding matrix: (4638, 300)
In [110]:
# Création du modèle

input=Input(shape=(len(x_sentences_stem),maxlen),dtype='float64')
word_input_stem=Input(shape=(maxlen,),dtype='float64')  
word_embedding_stem=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input_stem)
word_vec=GlobalAveragePooling1D()(word_embedding_stem)  
embed_model_stem = Model([word_input_stem],word_vec)

embed_model_stem.summary()
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_4 (InputLayer)        [(None, 2524)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 2524, 300)         1391400   
                                                                 
 global_average_pooling1d_1   (None, 300)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
=================================================================
Total params: 1,391,400
Trainable params: 1,391,400
Non-trainable params: 0
_________________________________________________________________
In [111]:
embeddings_stem = embed_model_stem.predict(x_sentences_stem)
embeddings_stem.shape
Out[111]:
(1050, 300)
In [112]:
X_tsne_w2v_stem = tsne.fit_transform(embeddings_stem)

df_tsne_w2v_stem = pd.DataFrame(X_tsne_w2v_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_w2v_stem.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.024s...
[t-SNE] Computed neighbors for 1050 samples in 1.403s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.014665
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.936592
[t-SNE] KL divergence after 1600 iterations: 0.421429
(1050, 2)
In [113]:
plot_kmeans_tsne(df_tsne_w2v_stem,
                 "Cluster Kmeans based on stemmed word2vec",
                "Kmeans_word2vec_stemmed", "word2vec_stemmed") 
Adjusted Rand Index: 0.306
In [114]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster word2vec_stemmed'] == x].index
             for x in data['cluster word2vec_stemmed'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

BERT

Le modèle BERT génère des plongements qui permettent d'avoir plusieurs représentations vectorielles pour le même mot, en fonction du contexte dans lequel le mot est utilisé.

In [115]:
# Fonction de préparation des sentences
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    input_ids=[]
    token_type_ids = []
    attention_mask=[]
    bert_inp_tot = []

    for sent in sentences:
        bert_inp = bert_tokenizer.encode_plus(sent,
                                              add_special_tokens = True,
                                              max_length = max_length,
                                              padding='max_length',
                                              return_attention_mask = True, 
                                              return_token_type_ids=True,
                                              truncation=True,
                                              return_tensors="tf")
    
        input_ids.append(bert_inp['input_ids'][0])
        token_type_ids.append(bert_inp['token_type_ids'][0])
        attention_mask.append(bert_inp['attention_mask'][0])
        bert_inp_tot.append((bert_inp['input_ids'][0], 
                             bert_inp['token_type_ids'][0], 
                             bert_inp['attention_mask'][0]))

    input_ids = np.asarray(input_ids)
    token_type_ids = np.asarray(token_type_ids)
    attention_mask = np.array(attention_mask)
    
    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Fonction de création des features
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        input_ids, token_type_ids, attention_mask, bert_inp_tot = bert_inp_fct(sentences[idx:idx+batch_size], 
                                                                      bert_tokenizer, max_length)
        
        if mode=='HF' :    # Bert HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
            last_hidden_states = outputs.last_hidden_state

        if mode=='TFhub' : # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids" : input_ids, 
                                 "input_mask" : attention_mask, 
                                 "input_type_ids" : token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']
             
        if step ==0 :
            last_hidden_states_tot = last_hidden_states
            last_hidden_states_tot_0 = last_hidden_states
        else :
            last_hidden_states_tot = np.concatenate((last_hidden_states_tot,last_hidden_states))
    
    features_bert = np.array(last_hidden_states_tot).mean(axis=1)
    
    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)
     
    return features_bert, last_hidden_states_tot
In [116]:
# Guide sur le Tensorflow hub : https://www.tensorflow.org/text/tutorials/classify_text_with_bert
#model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
model_url = hub.load("bert_en_uncased_L-12_H-768_A-12_4/")
bert_layer = hub.KerasLayer(model_url, trainable=True)

lemmatize

In [117]:
sentences = data['lemmatize'].to_list()
In [118]:
max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = bert_layer

features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences, 
                                                         max_length, batch_size, mode='TFhub')
temps traitement :  100.0
In [119]:
X_tsne_bert = tsne.fit_transform(features_bert)

df_tsne_bert = pd.DataFrame(X_tsne_bert[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_bert.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.040s...
[t-SNE] Computed neighbors for 1050 samples in 2.231s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 1.472073
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.370544
[t-SNE] KL divergence after 1900 iterations: 0.529820
(1050, 2)
In [120]:
plt.figure(figsize=(15,8))

plot_kmeans_tsne(df_tsne_bert,
                 "Cluster Kmeans based on lemmatized BERT",
                "Kmeans_BERT_lem", "BERT_lem") 
Adjusted Rand Index: 0.343
In [121]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster BERT_lem'] == x].index
             for x in data['cluster BERT_lem'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

stemming

In [122]:
sentences_stem = data['porter_stemmed'].to_list()
In [123]:
features_bert_stem, last_hidden_states_tot_stem = feature_BERT_fct(model, model_type, sentences_stem, 
                                                         max_length, batch_size, mode='TFhub')
temps traitement :  113.0
In [124]:
X_tsne_bert_stem = tsne.fit_transform(features_bert_stem)

df_tsne_bert_stem = pd.DataFrame(X_tsne_bert_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_bert_stem.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.049s...
[t-SNE] Computed neighbors for 1050 samples in 2.698s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 1.287478
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.324543
[t-SNE] KL divergence after 1750 iterations: 0.596094
(1050, 2)
In [125]:
plt.figure(figsize=(15,8))

plot_kmeans_tsne(df_tsne_bert_stem,
                 "Cluster Kmeans based on stemmed BERT",
                "Kmeans_BERT_stem", "BERT_stem") 
Adjusted Rand Index: 0.338
In [126]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster BERT_stem'] == x].index
             for x in data['cluster BERT_stem'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

USE (Universal Sentence Encoder)

USE calcule une représentation vectorielle d’un texte, cette représentation respecte la proximité sémantique (similarité) des textes entre eux. Le modèle permets d’identifier l’importance des mots dans un contexte en fonction de leur position et de leur identité.

In [127]:
def feature_USE_fct(sentences, b_size) :
    batch_size = b_size

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])

        if step ==0 :
            features = feat
        else :
            features = np.concatenate((features,feat))

    return features

Lemmatize

In [128]:
batch_size = 10
sentences = data["lemmatize"].to_list()
In [130]:
tf_tensor = embed(sentences)
df_use = pd.DataFrame(tens.make_ndarray(tens.make_tensor_proto(tf_tensor)),
                      index=data.index,
                      columns=['dim'+str(i) for i in range(512)])
In [131]:
tsne_results_use = tsne.fit_transform(df_use) 

df_tsne_use = pd.DataFrame(tsne_results_use[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_use.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.019s...
[t-SNE] Computed neighbors for 1050 samples in 1.071s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.401216
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.701965
[t-SNE] KL divergence after 3700 iterations: 0.484854
(1050, 2)
In [132]:
plt.figure(figsize=(15,8))

plot_kmeans_tsne(df_tsne_use,
                 "Cluster Kmeans based on lemmatized USE",
                "Kmeans_USE_lem", "USE_lem") 
Adjusted Rand Index: 0.428
Ce graphique montre que le modèle arrive à différencier plusieurs catégories avec un ARI score de 0.428.
In [133]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster USE_lem'] == x].index
             for x in data['cluster USE_lem'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

stemming

In [134]:
sentences_stem = data["porter_stemmed"].to_list()
In [135]:
features_USE_stem = feature_USE_fct(sentences_stem, batch_size)
In [136]:
features_USE_stem
Out[136]:
array([[ 0.02943964,  0.01765896, -0.02141097, ...,  0.05872097,
        -0.02617009, -0.0327165 ],
       [-0.00145227,  0.03880886, -0.07883427, ..., -0.05704426,
         0.00656295, -0.05974566],
       [-0.04211751,  0.06859342, -0.00776462, ..., -0.02375256,
        -0.02260967, -0.08099879],
       ...,
       [-0.02453436,  0.01296988, -0.0550962 , ..., -0.00492339,
        -0.02162995, -0.04616145],
       [ 0.01556673, -0.06905145,  0.03799624, ..., -0.00134891,
        -0.00774331, -0.0572109 ],
       [-0.00668812, -0.05716998,  0.02141155, ...,  0.03495907,
        -0.01074826, -0.08236039]], dtype=float32)
In [137]:
X_tsne_use_stem = tsne.fit_transform(features_USE_stem)

df_tsne_use_stem = pd.DataFrame(X_tsne_use_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_use_stem.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.023s...
[t-SNE] Computed neighbors for 1050 samples in 1.093s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.394346
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.502724
[t-SNE] KL divergence after 2550 iterations: 0.469928
(1050, 2)
In [138]:
plt.figure(figsize=(15,8))

plot_kmeans_tsne(df_tsne_use_stem,
                 "Cluster Kmeans based on stemmed USE",
                "Kmeans_USE_stem", "USE_stem") 
Adjusted Rand Index: 0.419
In [139]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster USE_stem'] == x].index
             for x in data['cluster USE_stem'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [140]:
list_aris = [*list_ari, *list_ari2]
In [141]:
df_ari=pd.DataFrame([list_aris]
                    ,columns=['km_pca_tf_lem','km_pca_tf_idf_lem','km_lda_tf_lem','km_lda_tf_idf_lem',
                    'km_pca_tf_stem','km_pca_tf_idf_stem','km_lda_tf_stem','km_lda_tf_idf_stem',
                    'km_word2vec_lem','km_word2vec_stem','km_bert_lem','km_bert_stem','km_use_lem','km_use_stem'],
                    index=['ARI_SCORE'])
In [142]:
df_ari.T.round(2).plot(kind="bar",figsize=(10,6))
plt.xlabel("Model")
plt.ylabel("ARI Score")
Out[142]:
Text(0, 0.5, 'ARI Score')
Le modèle Kmeans avec le USE et lemmatisé donne le meilleur résultat avec 0.432
In [143]:
df_use.to_csv("Flipkart/df_use.csv")
In [144]:
data.to_csv("Flipkart/data_cleaned.csv")
In [145]:
df_ari.to_csv("Flipkart/ari.csv")
In [ ]: